dir.create(path = results_outpath)
library('SingleCellExperiment')
library('Seurat')
library('dplyr')
library('ggplot2')
dat <- read.table(file = data_inpath, sep = ',', stringsAsFactors = FALSE, row.names = 1, header = TRUE)
dat <- Matrix::Matrix(data = as.matrix(dat), sparse = TRUE)
dat <- round(dat) # for some reason, counts are non-integers....
dat_sce <- SingleCellExperiment(assay = list('counts' = dat))
dat <- CreateSeuratObject(counts = dat, project = 'SVZ', assay = 'RNA')
# For SCE objects:
# dat[['sample']] <- sapply(X = strsplit(x = colnames(dat), split = '_'), FUN = `[`, 1)
tmp <- dat@meta.data
tmp <- tmp[sample(x = nrow(tmp), size = nrow(tmp)),]
plot(x = tmp$nCount_RNA,
y = tmp$nFeature_RNA,
xlab = 'total UMI',
ylab = '# genes detected',
main = 'Detection rates by sample',
sub = 'Black = SCtrl; Red = S1d; Green = S7d',
col = sapply(X = as.character(tmp$orig.ident), FUN = function(x) switch(x, 'SCtrl' = 'black', 'S1d' = 'red3', 'S7d' = 'green')))
From the above, we see that there are a few cells with outlier numbers of total UMI and genes detected. These are likely to be doublets. Beyond that, there are no obvious biases based on sample of origin.
Next we compute gene-level and cell-level QC metrics.
dat <- PercentageFeatureSet(dat, pattern = '^mt-', col.name = 'percent.mt')
dat_sce <- as.SingleCellExperiment(x = dat)
mito_gene <- grep(pattern = '^mt-', x = rownames(dat_sce), value = TRUE)
dat_sce <- scater::calculateQCMetrics(object = dat_sce, exprs_values = 'counts', feature_controls = list('percent.mt' = mito_gene))
tmp <- as.data.frame(colData(dat_sce)[c('log10_total_counts',
'log10_total_features_by_counts',
'pct_counts_in_top_50_features',
'pct_counts_in_top_100_features',
'pct_counts_in_top_200_features',
'pct_counts_in_top_500_features',
'log10_total_counts_percent.mt')])
dat@meta.data <- cbind(dat@meta.data, tmp)
qc_metrics_plot <- dat@meta.data %>%
reshape2::melt(id.vars = 'orig.ident') %>%
ggplot(mapping = aes(x = orig.ident, y = value)) +
geom_violin(scale = 'width') +
ggbeeswarm::geom_quasirandom(size = 0.25) +
facet_wrap(. ~ variable, scales = 'free_y') +
labs(title = 'Quality control metrics') +
theme(axis.title = element_blank(),
plot.title = element_text(size = 18, face = 'bold'),
axis.line.y = element_line(size = 1),
axis.ticks.y = element_line(size = 1))
qc_metrics_plot
In general, the samples share very similar distributions across quality control metrics. There are few outlier cells (see nCount_RNA i.e. library size). However, we note that the log library sizes (log10_total_counts) appear to have a bimodal distribution. (Note for JSC: previous “raw_counts_good_cells.csv” likely removed the lower mode during QC, which is why the total cells here are much greater than total cells previously, ~650 total). Additionally, we see that the percent of transcripts that map to mitochondrial genes have mean of 15.8%. The outlier cells with mito% greater than 3 median-absolute-deviations (30.4365647) should be considered suspect.
Since we cannot know whether the bimodal distributions are due to low-quality vs high-quality or true biological variation for now we will assume that most cells are of acceptable quality. Under this assumption, we can test two approaches for removing putative low-quality cells. The first is to apply a flat, adjustable threshold to each sample for each QC metric. The second is a multivariate approach via principal component analysis of all the QC metrics.
Flat QC thresholds:
# Designate which QC metrics to base filter
qc_metric <- c('log10_total_features_by_counts',
'log10_total_counts',
'pct_counts_in_top_50_features',
'pct_counts_in_top_100_features',
'pct_counts_in_top_200_features',
'pct_counts_in_top_500_features',
'percent.mt')
# Calculate 4*MAD thresholds (two-way)
qc_threshold <- apply(X = dat@meta.data[qc_metric],
MARGIN = 2,
FUN = function(x) {
hi <- median(x) + 3*mad(x, constant = 1)
lo <- median(x) - 3*mad(x, constant = 1)
return(c('hi' = hi, 'lo' = lo))
})
# Determine which are outliers for each metric
tmp <- sapply(X = qc_metric,
tmp_data = dat@meta.data,
tmp_threshold = qc_threshold,
FUN = function(xx, tmp_data, tmp_threshold) {
vals <- tmp_data[[xx]]
hi_thresh <- tmp_threshold['hi', xx]
lo_thresh <- tmp_threshold['lo', xx]
outlier <- vals > hi_thresh | vals < lo_thresh
return(outlier)
})
# If a cell is above/below thresholds for any metric, consider outlier.
tmp <- apply(X = tmp, MARGIN = 1, FUN = any)
dat$outlier <- tmp
table('sample' = dat$orig.ident, 'is outlier?' = dat$outlier)
# Plot the outlier cells in violin
qc_metrics_plot <- dat@meta.data %>%
reshape2::melt(id.vars = c('orig.ident','outlier')) %>%
ggplot(mapping = aes(x = orig.ident, y = value)) +
geom_violin(scale = 'width') +
ggbeeswarm::geom_quasirandom(mapping = aes(color = ifelse(outlier, 'True','False')), size = 0.25) +
scale_color_manual(values = c('True' = 'red', 'False' = 'black')) +
facet_wrap(. ~ variable, scales = 'free_y') +
labs(title = 'Quality control metrics') +
theme(axis.title = element_blank(),
plot.title = element_text(size = 18, face = 'bold'),
axis.line.y = element_line(size = 1),
axis.ticks.y = element_line(size = 1),
legend.key = element_rect(fill = NA),
legend.position = 'bottom') +
guides(color = guide_legend(title = 'Is outlier?'))
qc_metrics_plot
## is outlier?
## sample FALSE TRUE
## S1d 315 69
## S7d 283 101
## SCtrl 328 56
Multivariate QC approach:
qc_metric <- c('log10_total_features_by_counts',
'log10_total_counts',
'pct_counts_in_top_50_features',
'pct_counts_in_top_100_features',
'pct_counts_in_top_200_features',
'pct_counts_in_top_500_features',
'percent.mt')
dat_sce <- scater::runPCA(x = dat_sce,
use_coldata = TRUE,
selected_variables = qc_metric,
detect_outliers = TRUE,
ncomponents = 2)
plot(dat_sce@reducedDims$PCA_coldata,
col = ifelse(dat_sce$outlier, 'red','black'),
sub = 'Red = discarded outlier; Black = retained')
Gene-level metrics:
scater::plotHighestExprs(dat_sce)
dat_full <- dat
dat <- dat[,dat$outlier == FALSE]
Now we do a preliminary cluster analysis to check that results are not strongly driven by what we would consider technical variation.
dat <- dat %>%
NormalizeData() %>%
FindVariableFeatures() %>%
ScaleData() %>%
RunPCA(npcs = 40)
# ElbowPlot(dat, ndims = 40)
dat <- dat %>%
FindNeighbors(dims = 1:20) %>%
RunTSNE(dims = 1:20) %>%
FindClusters()
dat$orig.ident <- factor(dat$orig.ident, levels = c('SCtrl','S1d','S7d'))
# DimPlot(dat, label = TRUE, group.by = 'orig.ident')
plot1 <- DimPlot(dat, group.by = 'orig.ident') +
theme(axis.line = element_line(size = 0.5),
axis.ticks = element_blank(),
axis.text = element_blank(),
axis.title = element_text(size = 10),
panel.border = element_rect(color = 'black', fill = NA))
plot2 <- DimPlot(dat, group.by = 'seurat_clusters', label = TRUE, label.size = 6) +
theme(axis.line = element_line(size = 0.5),
axis.ticks = element_blank(),
axis.text = element_blank(),
axis.title = element_text(size = 10),
panel.border = element_rect(color = 'black', fill = NA))
plot <- cowplot::plot_grid(plot1, plot2, ncol = 2, rel_widths = c(1,0.925))
plot
qc_metric <- c('log10_total_features_by_counts',
'log10_total_counts',
'pct_counts_in_top_50_features',
'pct_counts_in_top_100_features',
'pct_counts_in_top_200_features',
'pct_counts_in_top_500_features',
'percent.mt')
qc_tsne <- vector(mode = 'list', length = length(qc_metric))
names(qc_tsne) <- qc_metric
tmp <- cbind(dat@meta.data, dat[['tsne']]@cell.embeddings)
for(metric in qc_metric) {
qc_tsne[[metric]] <- tmp %>%
ggplot(mapping = aes_string(x = 'tSNE_1', y = 'tSNE_2')) +
geom_point(mapping = aes_string(color = metric)) +
scale_color_viridis_c() +
theme(axis.line = element_line(size = 0.5),
axis.ticks = element_blank(),
axis.text = element_blank(),
axis.title = element_text(size = 10),
panel.border = element_rect(color = 'black', fill = NA),
legend.title = element_blank()) +
labs(title = metric) +
guides(color = guide_colorbar(barwidth = 1,
frame.colour = 'black',
frame.linewidth = 1,
ticks.colour = 'black',
ticks.linewidth = 1))
}
qc_tsne <- cowplot::plot_grid(plotlist = qc_tsne, ncol = round(sqrt(length(qc_metric))))
qc_tsne
From these data, we observe that quality control metrics drive a significant portion of the clustering results. Specifically, cells that would be considered low quality cluster together and comprise clusters 0 and 1. Moving forward, we apply a flat QC threshold for library size and percent.mt based. These threshold values are based on previous scSeq studies.
umi_lo_threshold <- 1e3
log_umi_hi_threshold <- median(dat$log10_total_counts) + 3*mad(x = dat$log10_total_counts, constant = 1)
percent_mt_threshold <- 25
dat <- dat_full
tmp <- dat$nCount_RNA < umi_lo_threshold | dat$log10_total_counts > log_umi_hi_threshold | dat$percent.mt > percent_mt_threshold
dat$outlier <- tmp
# Plot the outlier cells in violin
qc_metrics_plot <- dat@meta.data %>%
reshape2::melt(id.vars = c('orig.ident','outlier')) %>%
ggplot(mapping = aes(x = orig.ident, y = value)) +
geom_violin(scale = 'width') +
ggbeeswarm::geom_quasirandom(mapping = aes(color = ifelse(outlier, 'True','False')), size = 0.35) +
scale_color_manual(values = c('True' = 'red', 'False' = 'black')) +
facet_wrap(. ~ variable, scales = 'free_y') +
labs(title = 'Quality control metrics') +
theme(axis.title = element_blank(),
plot.title = element_text(size = 18, face = 'bold'),
axis.line.y = element_line(size = 1),
axis.ticks.y = element_line(size = 1),
legend.key = element_rect(fill = NA),
legend.position = 'bottom') +
guides(color = guide_legend(title = 'Is outlier?'))
qc_metrics_plot
dat_full <- dat
dat <- dat[,dat$outlier == FALSE]
We repeat the basic cluster analysis.
firstup <- function(x) {
substr(x, 1, 1) <- toupper(substr(x, 1, 1))
return(x)
}
cc_genes <- firstup(tolower(readLines(con = cc_genes_path)))
s.genes <- intersect(cc_genes[1:43], rownames(dat))
g2m.genes <- intersect(cc_genes[44:97], rownames(dat))
dat <- dat %>%
NormalizeData() %>%
FindVariableFeatures() %>%
CellCycleScoring(s.features = s.genes, g2m.features = g2m.genes)
dat$CC.difference <- dat$S.Score - dat$G2M.Score
dat <- dat %>%
ScaleData(vars.to.regress = 'CC.difference') %>%
RunPCA(npcs = 40) %>%
# ElbowPlot(dat, ndims = 40)
FindNeighbors(dims = 1:15) %>%
RunTSNE(dims = 1:15) %>%
FindClusters()
dat$orig.ident <- factor(dat$orig.ident, levels = c('SCtrl','S1d','S7d'))
# DimPlot(dat, label = TRUE, group.by = 'orig.ident')
plot1 <- DimPlot(dat, group.by = 'orig.ident') +
theme(axis.line = element_line(size = 0.5),
axis.ticks = element_blank(),
axis.text = element_blank(),
axis.title = element_text(size = 10),
panel.border = element_rect(color = 'black', fill = NA))
plot2 <- DimPlot(dat, group.by = 'seurat_clusters', label = TRUE, label.size = 6) +
theme(axis.line = element_line(size = 0.5),
axis.ticks = element_blank(),
axis.text = element_blank(),
axis.title = element_text(size = 10),
panel.border = element_rect(color = 'black', fill = NA))
plot <- cowplot::plot_grid(plot1, plot2, ncol = 2, rel_widths = c(1,0.925))
plot
qc_metric <- c('log10_total_features_by_counts',
'log10_total_counts',
'pct_counts_in_top_200_features',
'percent.mt')
qc_tsne <- vector(mode = 'list', length = length(qc_metric))
names(qc_tsne) <- qc_metric
tmp <- cbind(dat@meta.data, dat[['tsne']]@cell.embeddings)
for(metric in qc_metric) {
qc_tsne[[metric]] <- tmp %>%
ggplot(mapping = aes_string(x = 'tSNE_1', y = 'tSNE_2')) +
geom_point(mapping = aes_string(color = metric)) +
scale_color_viridis_c() +
theme(axis.line = element_line(size = 0.5),
axis.ticks = element_blank(),
axis.text = element_blank(),
axis.title = element_text(size = 10),
panel.border = element_rect(color = 'black', fill = NA),
legend.title = element_blank()) +
labs(title = metric) +
guides(color = guide_colorbar(barwidth = 1,
frame.colour = 'black',
frame.linewidth = 1,
ticks.colour = 'black',
ticks.linewidth = 1))
}
qc_tsne <- cowplot::plot_grid(plotlist = qc_tsne, ncol = round(sqrt(length(qc_metric))))
qc_tsne
plot1 <- ElbowPlot(dat, ndims = 40)
dat <- JackStraw(object = dat, reduction = 'pca', dims = 40)
dat <- ScoreJackStraw(object = dat, reduction = 'pca', dims = 1:40)
plot2 <- JackStrawPlot(object = dat, dims = 1:40)
plot <- cowplot::plot_grid(plot1, plot2, ncol = 1, rel_heights = c(0.5,1))
plot
npcs <- which(dat[['pca']]@jackstraw$overall.p.values[,2] < 1e-10)
dat <- RunTSNE(object = dat, dims = npcs)
total_var <- sum(matrixStats::rowVars(x = dat[['RNA']]@scale.data))
pc_ev <- dat[['pca']]@stdev^2
explained_var <- round(pc_ev/total_var * 100, digits = 1)
tmp <- cbind(dat[['pca']]@cell.embeddings, dat@meta.data)
tmp_plots <- vector(mode = 'list', length = length(npcs))
for(i in 1:(length(npcs)-1)) {
tmp_plots[[i]] <- vector(mode = 'list', length = length(npcs))
for(j in (i+1):length(npcs)) {
dim_i <- paste0('PC_', npcs[i])
dim_j <- paste0('PC_', npcs[j])
tmp_plots[[i]][[j]] <- tmp %>%
ggplot(mapping = aes_string(y = dim_i, x = dim_j)) +
geom_point(mapping = aes(color = orig.ident), size = 1) +
geom_hline(yintercept = 0, linetype = 'dashed') +
geom_vline(xintercept = 0, linetype = 'dashed') +
theme(panel.background = element_rect(fill = NA, color = 'black'),
panel.grid = element_blank(),
axis.title = element_blank(),
axis.text = element_blank(),
axis.ticks = element_blank(),
legend.position = 'none')
}
tmp_plots[[i]][[i]] <- cowplot::ggdraw() + cowplot::draw_label(x = 0.5, y = 0.5, paste0(dim_i, '\n', explained_var[npcs[i]]))
tmp_plots[[i]] <- cowplot::plot_grid(plotlist = tmp_plots[[i]], ncol = length(tmp_plots[[i]]))
}
# tmp_plots[[length(npcs)]] <- vector(mode = 'list', length = length(npcs) + 1)
tmp_plots[[length(npcs)]][[length(npcs)]] <- cowplot::ggdraw() + cowplot::draw_label(x = 0.5, y = 0.5, paste0('PC_', npcs[length(npcs)], '\n', explained_var[npcs[length(npcs)]]))
tmp_plots[[length(npcs)]] <- cowplot::plot_grid(plotlist = tmp_plots[[length(npcs)]], ncol = length(tmp_plots[[length(npcs)]]))
tmp_plots <- cowplot::plot_grid(plotlist = tmp_plots, ncol = 1)
tmp <- tmp %>%
ggplot(mapping = aes_string(x = dim_i, y = dim_j)) +
geom_point(mapping = aes(color = orig.ident), size = 1) +
geom_hline(yintercept = 0, linetype = 'dashed') +
geom_vline(xintercept = 0, linetype = 'dashed') +
theme(panel.background = element_rect(fill = NA, color = 'black'),
panel.grid = element_blank(),
axis.title = element_blank(),
axis.text = element_blank(),
axis.ticks = element_blank(),
legend.key = element_rect(color = NA, fill = NA)) +
guides(color = guide_legend(title = 'Sample', override.aes = list(size = 4)))
tmp_legend <- cowplot::get_legend(tmp)
tmp_plots <- cowplot::plot_grid(tmp_plots, tmp_legend, rel_widths = c(1, 0.05))
tmp_plots
# distance matrix in PCA-space
dat_dist <- dist(x = dat[['pca']]@cell.embeddings[,npcs])
# Assign values of k.param to iterate
k_range <- seq(from = 5, to = 35, by = 5) # sqrt(ncol(dat)) == 25.6907
names(k_range) <- paste('k', k_range, sep ='_')
k_results <- vector(mode = 'list', length = length(k_range))
names(k_results) <- names(k_range)
# Cluster data through k.params using a set louvain resolution
for(k in 1:length(k_range)) {
tmp <- FindNeighbors(object = dat, k.param = k_range[k], dims = npcs)
tmp <- FindClusters(tmp, resolution = 0.8, algorithm = 3)
k_results[[names(k_range)[k]]] <- tmp@active.ident
}
k_results <- do.call(cbind, k_results)
# Compute silhouette coefficients for clustering results
silhouette_val <- vector(mode = 'list', length = ncol(k_results))
names(silhouette_val) <- colnames(k_results)
for(k in 1:ncol(k_results)) {
silhouette_val[[colnames(k_results)[k]]] <- cluster::silhouette(x = k_results[,k], dist = dat_dist)
}
# Plot silhouette coefficients
tiff(filename = paste0(results_outpath, 'SilhouettePlots_kparamTesting.tiff'), height = 2400, width = 2440, res = 220)
plot_dim <- ceiling(sqrt(length(silhouette_val)))
{
par(mfrow = c(plot_dim, plot_dim))
for(k in 1:length(silhouette_val)) {
cat(plot(silhouette_val[[k]], border = NA, main = paste(names(silhouette_val)[k], '; res = 0.8'), cex = 1.5))
}
}
dev.off()
dat <- FindNeighbors(object = dat, dims = npcs, k.param = 15)
Now iterate through various louvain resolutions.
# distance matrix in PCA-space
dat_dist <- dist(x = dat[['pca']]@cell.embeddings[,npcs])
res_range <- c(0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0)
names(res_range) <- paste('res', res_range, sep ='_')
res_results <- vector(mode = 'list', length = length(res_range))
names(res_results) <- names(res_range)
# Cluster data through k.params using a set louvain resolution
for(r in 1:length(res_range)) {
tmp <- FindClusters(dat, resolution = res_range[r], algorithm = 3)
res_results[[names(res_range)[r]]] <- tmp@active.ident
}
res_results <- do.call(cbind, res_results)
# Compute silhouette coefficients for clustering results
silhouette_val <- vector(mode = 'list', length = ncol(res_results))
names(silhouette_val) <- colnames(res_results)
for(r in 1:ncol(res_results)) {
silhouette_val[[colnames(res_results)[r]]] <- cluster::silhouette(x = res_results[,r], dist = dat_dist)
}
# Plot silhouette coefficients
tiff(filename = paste0(results_outpath, 'SilhouettePlots_resTesting.tiff'), height = 2400, width = 2440, res = 220)
plot_dim <- ceiling(sqrt(length(silhouette_val)))
{
par(mfrow = c(ceiling(length(res_range)/plot_dim), plot_dim))
for(r in 1:length(silhouette_val)) {
cat(plot(silhouette_val[[r]], border = NA, main = paste('k.param = 15;', names(silhouette_val)[r]), cex = 1.5))
}
}
dev.off()
dat <- FindNeighbors(object = dat, dims = npcs, k.param = 15)
dat <- FindClusters(object = dat, resolution = 0.8)
dat <- RunTSNE(object = dat)
plot1 <- DimPlot(dat, group.by = 'orig.ident', reduction = 'tsne', pt.size = 2) +
theme(axis.line = element_line(size = 0.5),
axis.ticks = element_blank(),
axis.text = element_blank(),
axis.title = element_text(size = 10),
panel.border = element_rect(color = 'black', fill = NA))
plot2 <- DimPlot(dat, group.by = 'seurat_clusters', label = TRUE, label.size = 6, reduction = 'tsne', pt.size = 2) +
theme(axis.line = element_line(size = 0.5),
axis.ticks = element_blank(),
axis.text = element_blank(),
axis.title = element_text(size = 10),
panel.border = element_rect(color = 'black', fill = NA))
plot <- cowplot::plot_grid(plot1, plot2, ncol = 2, rel_widths = c(1,0.925))
plot
total_var <- sum(matrixStats::rowVars(x = dat[['RNA']]@scale.data))
pc_ev <- dat[['pca']]@stdev^2
explained_var <- round(pc_ev/total_var * 100, digits = 1)
color_by <- 'seurat_clusters'
tmp <- cbind(dat[['pca']]@cell.embeddings, dat@meta.data)
tmp_plots <- vector(mode = 'list', length = length(npcs))
for(i in 1:(length(npcs)-1)) {
tmp_plots[[i]] <- vector(mode = 'list', length = length(npcs))
for(j in (i+1):length(npcs)) {
dim_i <- paste0('PC_', npcs[i])
dim_j <- paste0('PC_', npcs[j])
tmp_plots[[i]][[j]] <- tmp %>%
ggplot(mapping = aes_string(y = dim_i, x = dim_j)) +
geom_point(mapping = aes_string(color = color_by), size = 1) +
geom_hline(yintercept = 0, linetype = 'dashed') +
geom_vline(xintercept = 0, linetype = 'dashed') +
theme(panel.background = element_rect(fill = NA, color = 'black'),
panel.grid = element_blank(),
axis.title = element_blank(),
axis.text = element_blank(),
axis.ticks = element_blank(),
legend.position = 'none')
}
tmp_plots[[i]][[i]] <- cowplot::ggdraw() + cowplot::draw_label(x = 0.5, y = 0.5, paste0(dim_i, '\n', explained_var[npcs[i]]))
tmp_plots[[i]] <- cowplot::plot_grid(plotlist = tmp_plots[[i]], ncol = length(tmp_plots[[i]]))
}
# tmp_plots[[length(npcs)]] <- vector(mode = 'list', length = length(npcs) + 1)
tmp_plots[[length(npcs)]][[length(npcs)]] <- cowplot::ggdraw() + cowplot::draw_label(x = 0.5, y = 0.5, paste0('PC_', npcs[length(npcs)], '\n', explained_var[npcs[length(npcs)]]))
tmp_plots[[length(npcs)]] <- cowplot::plot_grid(plotlist = tmp_plots[[length(npcs)]], ncol = length(tmp_plots[[length(npcs)]]))
tmp_plots <- cowplot::plot_grid(plotlist = tmp_plots, ncol = 1)
tmp <- tmp %>%
ggplot(mapping = aes_string(x = dim_i, y = dim_j)) +
geom_point(mapping = aes_string(color = color_by), size = 1) +
geom_hline(yintercept = 0, linetype = 'dashed') +
geom_vline(xintercept = 0, linetype = 'dashed') +
theme(panel.background = element_rect(fill = NA, color = 'black'),
panel.grid = element_blank(),
axis.title = element_blank(),
axis.text = element_blank(),
axis.ticks = element_blank(),
legend.key = element_rect(color = NA, fill = NA)) +
guides(color = guide_legend(title = 'Cluster', override.aes = list(size = 4)))
tmp_legend <- cowplot::get_legend(tmp)
tmp_plots <- cowplot::plot_grid(tmp_plots, tmp_legend, rel_widths = c(1, 0.05))
tmp_plots
markers <- FindAllMarkers(object = dat, assay = 'RNA', slot = 'data', test.use = 'wilcox', only.pos = TRUE)
saveRDS(object = markers, file = paste0(results_outpath, 'graph_de_markers.rds'))
These are globally DE genes i.e. each cluster vs all other cells combined. DE is calculated using a Wilcoxon Rank sum test, which is a conservative approach (low false-positives but high false-negatives).
markers <- readRDS(file = paste0(results_outpath, 'graph_de_markers.rds'))
markers <- markers[c(6,7,2,5,1,3,4)]
markers[c('p_val_adj','avg_logFC','p_val','pct.1','pct.2')] <- signif(markers[c('p_val_adj','avg_logFC','p_val','pct.1','pct.2')]
, digits = 3)
DT::datatable(data = markers, rownames = FALSE, extensions = c('Scroller','Buttons'), options = list(scroller = TRUE, deferRender = TRUE, scrollY = 350, fixedColumns = TRUE, autoWidth = TRUE, pageLength = 10, dom = 'Bfrtip', buttons = list(extend = 'collection', buttons = c('csv', 'excel', 'pdf'))))
Heatmap of the top 7 DE genes for each cluster. Some clusters have overlapping globally-DE genes (e.g. cluster 4 with clusters 0 and 6).
markers <- readRDS(file = paste0(results_outpath, 'graph_de_markers.rds'))
ngenes <- 7
marker.genes <- c()
for(c in levels(markers$cluster)) {
tmp_results <- markers[markers$cluster == c & markers$p_val_adj < 1e-03,]
tmp_genes <- tmp_results[order(tmp_results$avg_logFC, decreasing = TRUE),]$gene
genes_index <- which(!(tmp_genes %in% marker.genes) & tmp_genes != 'AA467197')[1:ngenes]
marker.genes <- c(marker.genes, tmp_genes[genes_index])
}
names(marker.genes) <- rep(levels(markers$cluster), each = ngenes)
Idents(dat) <- 'seurat_clusters'
DefaultAssay(dat) <- 'RNA'
tmp <- ScaleData(dat, features = marker.genes)
avg.exp <- FetchData(tmp, vars = c('seurat_clusters', marker.genes), slot = 'scale.data') %>%
reshape2::melt(id.vars = c('seurat_clusters')) %>%
group_by(seurat_clusters, variable) %>%
summarise(avg.exp = mean(value))
for(c in levels(markers$cluster)) {
tmp <- avg.exp[avg.exp$seurat_clusters == c & avg.exp$variable %in% marker.genes[names(marker.genes) == c],]
tmp <- tmp[order(tmp$avg.exp, decreasing = TRUE),]$variable
marker.genes[names(marker.genes) == c] <- as.character(tmp)
}
avg.exp <- avg.exp %>% mutate(variable = factor(variable, levels = rev(marker.genes)))
paletteLength <- 200
max.z <- 2
myColor <- rev(colorRampPalette(RColorBrewer::brewer.pal(n = 9, name = 'RdBu'))(paletteLength))
myBreaks <- c(seq(min(avg.exp$avg.exp), 0, length.out=ceiling(paletteLength/2) + 1),
seq(min(max.z, max(avg.exp$avg.exp))/paletteLength, min(max.z, max(avg.exp$avg.exp)), length.out=floor(paletteLength/2)))
marker.heatmap <- avg.exp %>%
ggplot(mapping = aes(x = seurat_clusters, y = variable, fill = avg.exp)) +
geom_tile(color = 'black', size = 0.25) +
scale_x_discrete(expand = c(0,0)) +
scale_y_discrete(expand = c(0,0)) +
scale_fill_gradientn(colors = myColor,
values = scales::rescale(x = myBreaks, to = c(0,1)),
limits = c(NA, max.z), na.value = myColor[paletteLength], breaks = seq(-5,5,0.5)) +
theme(axis.text.x = element_text(size = 12),
axis.title = element_blank()) +
guides(fill = guide_colorbar(title = 'Expression\nz-score', barwidth = 1.25, frame.colour = 'black', frame.linewidth = 1, ticks.colour = 'black', ticks.linewidth = 1))
marker.heatmap
ggsave(filename = paste0(results_outpath, 'graph_cluster_markers_heatmap.tiff'), plot = marker.heatmap, device = 'tiff', height = 9, width = 4)
Next we explore the population changes for each of the clusters between the 3 conditions. We calculate the absolute numbers of cells per cluster as well as the proportion out of all cells for each condition.
counts <- data.frame(table(dat$seurat_clusters, dat$orig.ident))
names(counts) <- c('seurat_clusters','orig.ident','count')
numbers <- counts %>%
ggplot(mapping = aes(x = orig.ident, y = count, group = seurat_clusters)) +
geom_bar(aes(fill = seurat_clusters), stat = 'identity', color = 'black', size = 0.75) +
scale_y_continuous() +
scale_x_discrete() +
ylab('# of cells') +
theme(axis.title.x = element_blank(),
axis.title.y = element_text(size = 12),
axis.text.x = element_text(size = 12),
axis.line = element_line(size = 1),
panel.background = element_rect(fill = NA),
axis.text.y = element_text(size = 12))
props <- counts %>%
ggplot(mapping = aes(x = orig.ident, y = count, group = seurat_clusters)) +
geom_bar(aes(fill = seurat_clusters), stat = 'identity', color = 'black', size = 0.75, position = 'fill') +
scale_y_continuous() +
scale_x_discrete() +
ylab('% of cells') +
theme(axis.title.x = element_blank(),
axis.title.y = element_text(size = 12),
axis.text.x = element_text(size = 12),
axis.line = element_line(size = 1),
panel.background = element_rect(fill = NA),
axis.text.y = element_text(size = 12),
legend.title = element_blank(),
legend.text = element_text(size = 12))
legend <- cowplot::get_legend(plot = props)
pop.dynamics <- cowplot::plot_grid(numbers + theme(legend.position = 'none'), props + theme(legend.position = 'none'), legend, ncol = 3, rel_widths = c(1, 1, 0.25))
pop.dynamics
ggsave(filename = paste0(results_outpath, 'population_dynamics.tiff'), plot = pop.dynamics, device = 'tiff', height = 3.75, width = 8)
sessionInfo()